library(car)
library(mosaic)
library(DT)
library(tidyverse)

Read in the Data

files <- dir()
rdat <- read.csv(grep(".csv", files, value=TRUE), header=TRUE)
as_tibble(rdat)
# A tibble: 230 x 6
        y    x1     x2    x3    x4    x5
    <dbl> <int>  <dbl> <dbl> <dbl> <int>
 1  0.206     1 0.121  -7.62  1.41     1
 2  0.173     0 0.288   6.66  1.24     0
 3 -0.940     1 0.346  15.1  -3.41     0
 4 -0.900     0 0.0425 14.0  -1.94     0
 5  1.62      1 0.611  -1.38  4.12     1
 6  0.253     0 0.309   3.66  5.53     1
 7 -1.07      1 0.322   1.64 -3.67     0
 8 -0.493     0 0.106   8.74 -2.86     0
 9  0.802     1 0.206   8.61  4.76     0
10  0.538     0 0.227   6.40  5.39     1
# … with 220 more rows

Create first Pairs Plot

pairs(rdat, pch=16, cex=1, panel=panel.smooth, col=rgb(.8,.8,.8))

Start with x4

lm1 <- lm(y ~ x4, data=rdat)
summary(lm1)
## 
## Call:
## lm(formula = y ~ x4, data = rdat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14252 -0.18749  0.02534  0.17983  0.75059 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.083896   0.020633   4.066 6.59e-05 ***
## x4          0.191594   0.006012  31.870  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3124 on 228 degrees of freedom
## Multiple R-squared:  0.8167, Adjusted R-squared:  0.8159 
## F-statistic:  1016 on 1 and 228 DF,  p-value: < 2.2e-16
palette(c("skyblue","orange","green","purple","steelblue","red","green3","black"))
pairs(cbind(R=lm1$res, Fit=lm1$fit, rdat), pch=16, cex=1, panel=panel.smooth, col=as.factor(rdat$x5))

Add x1

lm1 <- lm(y ~ x4, data=rdat)
summary(lm1)
## 
## Call:
## lm(formula = y ~ x4, data = rdat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14252 -0.18749  0.02534  0.17983  0.75059 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.083896   0.020633   4.066 6.59e-05 ***
## x4          0.191594   0.006012  31.870  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3124 on 228 degrees of freedom
## Multiple R-squared:  0.8167, Adjusted R-squared:  0.8159 
## F-statistic:  1016 on 1 and 228 DF,  p-value: < 2.2e-16
palette(c("skyblue","orange","green","purple","steelblue","red","green3","black"))
pairs(cbind(R=lm1$res, Fit=lm1$fit, rdat), pch=16, cex=1, panel=panel.smooth, col=interaction(rdat$x1,rdat$x5))

Try x1 and x5

lm2 <- lm(y ~ x4*x1*x5, data=rdat)
summary(lm2)
## 
## Call:
## lm(formula = y ~ x4 * x1 * x5, data = rdat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.98128 -0.18428  0.01031  0.20109  0.67184 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.071795   0.038542   1.863 0.063818 .  
## x4           0.213627   0.011092  19.259  < 2e-16 ***
## x1          -0.046025   0.054332  -0.847 0.397849    
## x5           0.004851   0.056237   0.086 0.931343    
## x4:x1       -0.013168   0.015555  -0.847 0.398171    
## x4:x5       -0.062079   0.015917  -3.900 0.000127 ***
## x1:x5        0.134686   0.079941   1.685 0.093429 .  
## x4:x1:x5     0.059754   0.023540   2.538 0.011820 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3009 on 222 degrees of freedom
## Multiple R-squared:  0.8343, Adjusted R-squared:  0.8291 
## F-statistic: 159.7 on 7 and 222 DF,  p-value: < 2.2e-16

Reduce

lm3 <- lm(y ~ x4 + x4:x5, data=rdat)
summary(lm3)
## 
## Call:
## lm(formula = y ~ x4 + x4:x5, data = rdat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0730 -0.1877  0.0247  0.1681  0.8251 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.08497    0.02034   4.178  4.2e-05 ***
## x4           0.20634    0.00795  25.955  < 2e-16 ***
## x4:x5       -0.03310    0.01190  -2.782  0.00586 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3078 on 227 degrees of freedom
## Multiple R-squared:  0.8227, Adjusted R-squared:  0.8212 
## F-statistic: 526.7 on 2 and 227 DF,  p-value: < 2.2e-16
palette(c("skyblue","orange","green","purple","steelblue","red","green3","black"))
pairs(cbind(R=lm3$res, Fit=lm3$fit, rdat), pch=16, cex=1, panel=panel.smooth, col=as.factor(rdat$x5))

Try some things…

I played with a few ideas until I got this…

lm4 <- lm(y ~ x4 + x4:x5 + 
            x1:x5 + x1:x4:x5, data=rdat)
summary(lm4)
## 
## Call:
## lm(formula = y ~ x4 + x4:x5 + x1:x5 + x1:x4:x5, data = rdat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.96151 -0.18246  0.01235  0.18386  0.69161 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.056870   0.022576   2.519  0.01246 *  
## x4           0.206687   0.007752  26.661  < 2e-16 ***
## x4:x5       -0.055140   0.013775  -4.003 8.49e-05 ***
## x5:x1        0.108436   0.047561   2.280  0.02355 *  
## x4:x5:x1     0.046587   0.017622   2.644  0.00878 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3002 on 225 degrees of freedom
## Multiple R-squared:  0.8329, Adjusted R-squared:   0.83 
## F-statistic: 280.5 on 4 and 225 DF,  p-value: < 2.2e-16
rdat2 <- cbind(rdat, fit=lm4$fitted.values)
ggplot(rdat2, aes(x=x4, color=interaction(x1,x5))) + 
  geom_point(aes(y=y), pch=1) + 
  geom_point(aes(y=fit), pch=16, cex=0.5) +
  geom_smooth(aes(y=y), method="lm") + 
  facet_wrap(~interaction(x1,x5))
## `geom_smooth()` using formula 'y ~ x'

Final Guess

lm

final.lm <- lm(y ~ x4 + x4:x5 + 
            x1:x5 + x1:x4:x5, data=rdat)
summary(final.lm)
## 
## Call:
## lm(formula = y ~ x4 + x4:x5 + x1:x5 + x1:x4:x5, data = rdat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.96151 -0.18246  0.01235  0.18386  0.69161 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.056870   0.022576   2.519  0.01246 *  
## x4           0.206687   0.007752  26.661  < 2e-16 ***
## x4:x5       -0.055140   0.013775  -4.003 8.49e-05 ***
## x5:x1        0.108436   0.047561   2.280  0.02355 *  
## x4:x5:x1     0.046587   0.017622   2.644  0.00878 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3002 on 225 degrees of freedom
## Multiple R-squared:  0.8329, Adjusted R-squared:   0.83 
## F-statistic: 280.5 on 4 and 225 DF,  p-value: < 2.2e-16

Graph

palette(c("skyblue","skyblue","green","purple","steelblue","red","green3","black"))
plot(y ~ x4, data=rdat, col=interaction(x1,x5))
points(final.lm$fit ~ x4, data=rdat, col=interaction(x1,x5), pch=16, cex=0.5)

b <- coef(final.lm)

x1=0; x5=0;
curve(b[1] + b[2]*x4 + b[3]*x4*x5 + b[4]*x5*x1 + b[5]*x4*x5*x1, add=TRUE, xname="x4", col=palette()[1])

x1=0; x5=1;
curve(b[1] + b[2]*x4 + b[3]*x4*x5 + b[4]*x5*x1 + b[5]*x4*x5*x1, add=TRUE, xname="x4", col=palette()[3])

x1=1; x5=1;
curve(b[1] + b[2]*x4 + b[3]*x4*x5 + b[4]*x5*x1 + b[5]*x4*x5*x1, add=TRUE, xname="x4", col=palette()[4])

Math Model

\[ Y_i = \beta_0 + \beta_1 X_{4i} + \beta_2 X_{4i} X_{5i} + \beta_3 X_{5i}X_{1i} + \beta_4 X_{4i} X_{5i} X_{1i} + \epsilon_i \]